{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Machine Learning Pipeline - Feature Selection\n",
    "\n",
    "In this notebook, we pick up the transformed datasets that we saved in the previous notebook."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reproducibility: Setting the seed\n",
    "\n",
    "With the aim to ensure reproducibility between runs of the same notebook, but also between the research and production environment, for each step that includes some element of randomness, it is extremely important that we **set the seed**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# to handle datasets\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "# for plotting\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# to build the models\n",
    "from sklearn.linear_model import Lasso\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "\n",
    "# to visualise al the columns in the dataframe\n",
    "pd.pandas.set_option('display.max_columns', None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>LotConfig</th>\n",
       "      <th>LandSlope</th>\n",
       "      <th>Neighborhood</th>\n",
       "      <th>Condition1</th>\n",
       "      <th>Condition2</th>\n",
       "      <th>BldgType</th>\n",
       "      <th>HouseStyle</th>\n",
       "      <th>OverallQual</th>\n",
       "      <th>OverallCond</th>\n",
       "      <th>YearBuilt</th>\n",
       "      <th>YearRemodAdd</th>\n",
       "      <th>RoofStyle</th>\n",
       "      <th>RoofMatl</th>\n",
       "      <th>Exterior1st</th>\n",
       "      <th>Exterior2nd</th>\n",
       "      <th>MasVnrType</th>\n",
       "      <th>MasVnrArea</th>\n",
       "      <th>ExterQual</th>\n",
       "      <th>ExterCond</th>\n",
       "      <th>Foundation</th>\n",
       "      <th>BsmtQual</th>\n",
       "      <th>BsmtCond</th>\n",
       "      <th>BsmtExposure</th>\n",
       "      <th>BsmtFinType1</th>\n",
       "      <th>BsmtFinSF1</th>\n",
       "      <th>BsmtFinType2</th>\n",
       "      <th>BsmtFinSF2</th>\n",
       "      <th>BsmtUnfSF</th>\n",
       "      <th>TotalBsmtSF</th>\n",
       "      <th>Heating</th>\n",
       "      <th>HeatingQC</th>\n",
       "      <th>CentralAir</th>\n",
       "      <th>Electrical</th>\n",
       "      <th>1stFlrSF</th>\n",
       "      <th>2ndFlrSF</th>\n",
       "      <th>LowQualFinSF</th>\n",
       "      <th>GrLivArea</th>\n",
       "      <th>BsmtFullBath</th>\n",
       "      <th>BsmtHalfBath</th>\n",
       "      <th>FullBath</th>\n",
       "      <th>HalfBath</th>\n",
       "      <th>BedroomAbvGr</th>\n",
       "      <th>KitchenAbvGr</th>\n",
       "      <th>KitchenQual</th>\n",
       "      <th>TotRmsAbvGrd</th>\n",
       "      <th>Functional</th>\n",
       "      <th>Fireplaces</th>\n",
       "      <th>FireplaceQu</th>\n",
       "      <th>GarageType</th>\n",
       "      <th>GarageYrBlt</th>\n",
       "      <th>GarageFinish</th>\n",
       "      <th>GarageCars</th>\n",
       "      <th>GarageArea</th>\n",
       "      <th>GarageQual</th>\n",
       "      <th>GarageCond</th>\n",
       "      <th>PavedDrive</th>\n",
       "      <th>WoodDeckSF</th>\n",
       "      <th>OpenPorchSF</th>\n",
       "      <th>EnclosedPorch</th>\n",
       "      <th>3SsnPorch</th>\n",
       "      <th>ScreenPorch</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "      <th>LotFrontage_na</th>\n",
       "      <th>MasVnrArea_na</th>\n",
       "      <th>GarageYrBlt_na</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.750000</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.461171</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.863636</td>\n",
       "      <td>0.4</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.777778</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.014706</td>\n",
       "      <td>0.049180</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.5</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.002835</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.673479</td>\n",
       "      <td>0.239935</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.559760</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.523250</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.375</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.416667</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.018692</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.430183</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.116686</td>\n",
       "      <td>0.032907</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.545455</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.750000</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.456066</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.363636</td>\n",
       "      <td>0.4</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.444444</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.360294</td>\n",
       "      <td>0.049180</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.03375</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.142807</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.114724</td>\n",
       "      <td>0.172340</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.434539</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.406196</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.375</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.457944</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.25</td>\n",
       "      <td>0.220028</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.75</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.636364</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.916667</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.394699</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.954545</td>\n",
       "      <td>0.4</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.888889</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.036765</td>\n",
       "      <td>0.098361</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.3</td>\n",
       "      <td>0.2</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.25750</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.5</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.080794</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.601951</td>\n",
       "      <td>0.286743</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.627205</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.586296</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.250</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.046729</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.406206</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.228705</td>\n",
       "      <td>0.149909</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.090909</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.750000</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.445002</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.454545</td>\n",
       "      <td>0.4</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.066176</td>\n",
       "      <td>0.163934</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.5</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.255670</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.018114</td>\n",
       "      <td>0.242553</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.00</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.566920</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.529943</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.375</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.250000</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.4</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.084112</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.362482</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.469078</td>\n",
       "      <td>0.045704</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.636364</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.75</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.750000</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.577658</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.363636</td>\n",
       "      <td>0.4</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.555556</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.323529</td>\n",
       "      <td>0.737705</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.7</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.17000</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.086818</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.434278</td>\n",
       "      <td>0.233224</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.75</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.549026</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.513216</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.375</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.416667</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.8</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.411215</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.50</td>\n",
       "      <td>0.406206</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.5</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.545455</td>\n",
       "      <td>0.666667</td>\n",
       "      <td>0.75</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   MSSubClass  MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \\\n",
       "0    0.750000      0.75     0.461171      0.0     1.0    1.0  0.333333   \n",
       "1    0.750000      0.75     0.456066      0.0     1.0    1.0  0.333333   \n",
       "2    0.916667      0.75     0.394699      0.0     1.0    1.0  0.000000   \n",
       "3    0.750000      0.75     0.445002      0.0     1.0    1.0  0.666667   \n",
       "4    0.750000      0.75     0.577658      0.0     1.0    1.0  0.333333   \n",
       "\n",
       "   LandContour  Utilities  LotConfig  LandSlope  Neighborhood  Condition1  \\\n",
       "0     1.000000        1.0        0.0        0.0      0.863636         0.4   \n",
       "1     0.333333        1.0        0.0        0.0      0.363636         0.4   \n",
       "2     0.333333        1.0        0.0        0.0      0.954545         0.4   \n",
       "3     0.666667        1.0        0.0        0.0      0.454545         0.4   \n",
       "4     0.333333        1.0        0.0        0.0      0.363636         0.4   \n",
       "\n",
       "   Condition2  BldgType  HouseStyle  OverallQual  OverallCond  YearBuilt  \\\n",
       "0         1.0      0.75         0.6     0.777778         0.50   0.014706   \n",
       "1         1.0      0.75         0.6     0.444444         0.75   0.360294   \n",
       "2         1.0      1.00         0.6     0.888889         0.50   0.036765   \n",
       "3         1.0      0.75         0.6     0.666667         0.50   0.066176   \n",
       "4         1.0      0.75         0.6     0.555556         0.50   0.323529   \n",
       "\n",
       "   YearRemodAdd  RoofStyle  RoofMatl  Exterior1st  Exterior2nd  MasVnrType  \\\n",
       "0      0.049180        0.0       0.0          1.0          1.0    0.333333   \n",
       "1      0.049180        0.0       0.0          0.6          0.6    0.666667   \n",
       "2      0.098361        1.0       0.0          0.3          0.2    0.666667   \n",
       "3      0.163934        0.0       0.0          1.0          1.0    0.333333   \n",
       "4      0.737705        0.0       0.0          0.6          0.7    0.666667   \n",
       "\n",
       "   MasVnrArea  ExterQual  ExterCond  Foundation  BsmtQual  BsmtCond  \\\n",
       "0     0.00000   0.666667        0.5         1.0  0.666667  0.666667   \n",
       "1     0.03375   0.666667        0.5         0.5  0.333333  0.666667   \n",
       "2     0.25750   1.000000        0.5         1.0  1.000000  0.666667   \n",
       "3     0.00000   0.666667        0.5         1.0  0.666667  0.666667   \n",
       "4     0.17000   0.333333        0.5         0.5  0.333333  0.666667   \n",
       "\n",
       "   BsmtExposure  BsmtFinType1  BsmtFinSF1  BsmtFinType2  BsmtFinSF2  \\\n",
       "0      0.666667           1.0    0.002835           0.0         0.0   \n",
       "1      0.000000           0.8    0.142807           0.0         0.0   \n",
       "2      0.000000           1.0    0.080794           0.0         0.0   \n",
       "3      1.000000           1.0    0.255670           0.0         0.0   \n",
       "4      0.000000           0.6    0.086818           0.0         0.0   \n",
       "\n",
       "   BsmtUnfSF  TotalBsmtSF  Heating  HeatingQC  CentralAir  Electrical  \\\n",
       "0   0.673479     0.239935      1.0       1.00         1.0         1.0   \n",
       "1   0.114724     0.172340      1.0       1.00         1.0         1.0   \n",
       "2   0.601951     0.286743      1.0       1.00         1.0         1.0   \n",
       "3   0.018114     0.242553      1.0       1.00         1.0         1.0   \n",
       "4   0.434278     0.233224      1.0       0.75         1.0         1.0   \n",
       "\n",
       "   1stFlrSF  2ndFlrSF  LowQualFinSF  GrLivArea  BsmtFullBath  BsmtHalfBath  \\\n",
       "0  0.559760       0.0           0.0   0.523250      0.000000           0.0   \n",
       "1  0.434539       0.0           0.0   0.406196      0.333333           0.0   \n",
       "2  0.627205       0.0           0.0   0.586296      0.333333           0.0   \n",
       "3  0.566920       0.0           0.0   0.529943      0.333333           0.0   \n",
       "4  0.549026       0.0           0.0   0.513216      0.000000           0.0   \n",
       "\n",
       "   FullBath  HalfBath  BedroomAbvGr  KitchenAbvGr  KitchenQual  TotRmsAbvGrd  \\\n",
       "0  0.666667       0.0         0.375      0.333333     0.666667      0.416667   \n",
       "1  0.333333       0.5         0.375      0.333333     0.666667      0.250000   \n",
       "2  0.666667       0.0         0.250      0.333333     1.000000      0.333333   \n",
       "3  0.666667       0.0         0.375      0.333333     0.666667      0.250000   \n",
       "4  0.666667       0.0         0.375      0.333333     0.333333      0.416667   \n",
       "\n",
       "   Functional  Fireplaces  FireplaceQu  GarageType  GarageYrBlt  GarageFinish  \\\n",
       "0         1.0    0.000000          0.0        0.75     0.018692           1.0   \n",
       "1         1.0    0.000000          0.0        0.75     0.457944           0.5   \n",
       "2         1.0    0.333333          0.8        0.75     0.046729           0.5   \n",
       "3         1.0    0.333333          0.4        0.75     0.084112           0.5   \n",
       "4         1.0    0.333333          0.8        0.75     0.411215           0.5   \n",
       "\n",
       "   GarageCars  GarageArea  GarageQual  GarageCond  PavedDrive  WoodDeckSF  \\\n",
       "0        0.75    0.430183         0.5         0.5         1.0    0.116686   \n",
       "1        0.25    0.220028         0.5         0.5         1.0    0.000000   \n",
       "2        0.50    0.406206         0.5         0.5         1.0    0.228705   \n",
       "3        0.50    0.362482         0.5         0.5         1.0    0.469078   \n",
       "4        0.50    0.406206         0.5         0.5         1.0    0.000000   \n",
       "\n",
       "   OpenPorchSF  EnclosedPorch  3SsnPorch  ScreenPorch  PoolArea  PoolQC  \\\n",
       "0     0.032907            0.0        0.0          0.0       0.0     0.0   \n",
       "1     0.000000            0.0        0.0          0.0       0.0     0.0   \n",
       "2     0.149909            0.0        0.0          0.0       0.0     0.0   \n",
       "3     0.045704            0.0        0.0          0.0       0.0     0.0   \n",
       "4     0.000000            0.0        1.0          0.0       0.0     0.0   \n",
       "\n",
       "   Fence  MiscFeature  MiscVal    MoSold  SaleType  SaleCondition  \\\n",
       "0   0.00          1.0      0.0  0.545455  0.666667           0.75   \n",
       "1   0.75          1.0      0.0  0.636364  0.666667           0.75   \n",
       "2   0.00          1.0      0.0  0.090909  0.666667           0.75   \n",
       "3   0.00          1.0      0.0  0.636364  0.666667           0.75   \n",
       "4   0.00          1.0      0.0  0.545455  0.666667           0.75   \n",
       "\n",
       "   LotFrontage_na  MasVnrArea_na  GarageYrBlt_na  \n",
       "0             0.0            0.0             0.0  \n",
       "1             0.0            0.0             0.0  \n",
       "2             0.0            0.0             0.0  \n",
       "3             1.0            0.0             0.0  \n",
       "4             0.0            0.0             0.0  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load the train and test set with the engineered variables\n",
    "\n",
    "# we built and saved these datasets in the previous lecture.\n",
    "# If you haven't done so, go ahead and check the previous notebook\n",
    "# to find out how to create these datasets\n",
    "\n",
    "X_train = pd.read_csv('xtrain.csv')\n",
    "X_test = pd.read_csv('xtest.csv')\n",
    "\n",
    "X_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>12.211060</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11.887931</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>12.675764</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>12.278393</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>12.103486</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   SalePrice\n",
       "0  12.211060\n",
       "1  11.887931\n",
       "2  12.675764\n",
       "3  12.278393\n",
       "4  12.103486"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# load the target (remember that the target is log transformed)\n",
    "y_train = pd.read_csv('ytrain.csv')\n",
    "y_test = pd.read_csv('ytest.csv')\n",
    "\n",
    "y_train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Feature Selection\n",
    "\n",
    "Let's go ahead and select a subset of the most predictive features. There is an element of randomness in the Lasso regression, so remember to set the seed."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SelectFromModel(estimator=Lasso(alpha=0.001, random_state=0))"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We will do the model fitting and feature selection\n",
    "# altogether in a few lines of code\n",
    "\n",
    "# first, we specify the Lasso Regression model, and we\n",
    "# select a suitable alpha (equivalent of penalty).\n",
    "# The bigger the alpha the less features that will be selected.\n",
    "\n",
    "# Then we use the selectFromModel object from sklearn, which\n",
    "# will select automatically the features which coefficients are non-zero\n",
    "\n",
    "# remember to set the seed, the random state in this function\n",
    "sel_ = SelectFromModel(Lasso(alpha=0.001, random_state=0))\n",
    "\n",
    "# train Lasso model and select features\n",
    "sel_.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "36"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sel_.get_support().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ True,  True,  True, False, False, False,  True,  True, False,\n",
       "        True, False,  True, False, False, False, False,  True,  True,\n",
       "       False,  True,  True, False,  True, False, False, False,  True,\n",
       "       False,  True,  True, False,  True,  True, False, False, False,\n",
       "       False, False, False,  True,  True, False,  True,  True, False,\n",
       "        True,  True, False, False,  True, False, False,  True,  True,\n",
       "        True,  True,  True, False, False,  True,  True,  True, False,\n",
       "       False,  True,  True, False, False, False,  True, False, False,\n",
       "       False, False, False, False, False,  True, False, False, False])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# let's visualise those features that were selected.\n",
    "# (selected features marked with True)\n",
    "\n",
    "sel_.get_support()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total features: 81\n",
      "selected features: 36\n",
      "features with coefficients shrank to zero: 45\n"
     ]
    }
   ],
   "source": [
    "# let's print the number of total and selected features\n",
    "\n",
    "# this is how we can make a list of the selected features\n",
    "selected_feats = X_train.columns[(sel_.get_support())]\n",
    "\n",
    "# let's print some stats\n",
    "print('total features: {}'.format((X_train.shape[1])))\n",
    "print('selected features: {}'.format(len(selected_feats)))\n",
    "print('features with coefficients shrank to zero: {}'.format(\n",
    "    np.sum(sel_.estimator_.coef_ == 0)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotShape', 'LandContour',\n",
       "       'LotConfig', 'Neighborhood', 'OverallQual', 'OverallCond',\n",
       "       'YearRemodAdd', 'RoofStyle', 'Exterior1st', 'ExterQual', 'Foundation',\n",
       "       'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC', 'CentralAir',\n",
       "       '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'BsmtFullBath', 'HalfBath',\n",
       "       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',\n",
       "       'FireplaceQu', 'GarageFinish', 'GarageCars', 'GarageArea', 'PavedDrive',\n",
       "       'WoodDeckSF', 'ScreenPorch', 'SaleCondition'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# print the selected features\n",
    "selected_feats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.Series(selected_feats).to_csv('selected_features.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Additional Resources\n",
    "\n",
    "- [Feature Selection for Machine Learning](https://www.trainindata.com/p/feature-selection-for-machine-learning) - Online Course\n",
    "- [Feature Selection in Machine Learning with Python](https://leanpub.com/feature-selection-in-machine-learning/) - Book\n",
    "- [Feature Selection for Machine Learning: A comprehensive Overview](https://www.blog.trainindata.com/feature-selection-for-machine-learning/) - Article"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "feml",
   "language": "python",
   "name": "feml"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.2"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {
    "height": "583px",
    "left": "0px",
    "right": "1324px",
    "top": "107px",
    "width": "212px"
   },
   "toc_section_display": "block",
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
